filter_features <- function(df_train_sm){
df_fs <- df_train_sm %>%
select(-engagement) %>%
select(-content_crashes) %>%
select(-client_id) %>%
select(-label_beta) %>%
select(-label_release) %>%
select(-is_release) %>%
select(-app_version)
return(df_fs)
}This analysis is focused on utilizing Boruta as a initial pre-filter to the covariates, to narrow the feature selection search space.
Apply Boruta to each performance covariate.
engagement <- c('active_hours','active_hours_max','uri_count','uri_count_max','search_count','search_count_max','num_pages','num_pages_max','daily_max_tabs','daily_max_tabs_max','daily_unique_domains','daily_unique_domains_max','daily_tabs_opened','daily_tabs_opened_max')
df_train_sm_1 <- df_train_encoder %>%
sample_n(1000)
df_train_sm_2 <- df_train_encoder %>%
sample_n(1000)
df_train_sm_3 <- df_train_encoder %>%
sample_n(1000)
#df_train_sm_4 <- df_train_encoder %>% sample_n(1000)
#df_train_sm_5 <- df_train_encoder %>% sample_n(1000)
df_fs_1 <- filter_features(df_train_sm_1)
df_fs_2 <- filter_features(df_train_sm_2)
df_fs_3 <- filter_features(df_train_sm_3)
#df_fs_4 <- filter_features(df_train_sm_4)
#df_fs_5 <- filter_features(df_train_sm_5)Boruta is a feature selection algorithm based on the random forest algorithm. In the process of deciding if a feature is important or not, some features may be marked as Tentative. Sometimes increasing the maxRuns can help resolve the Tentativeness of the feature.
boruta_results_1 <- list()
boruta_results_2 <- list()
boruta_results_3 <- list()
#boruta_results_4 <- list()
#boruta_results_5 <- list()
for (metric in engagement){
print(paste('Applying Boruta to ', metric))
boruta.out <- Boruta(y = df_train_sm_1[[metric]], x=df_fs_1, doTrace=0)
boruta_results_1[[metric]] <- boruta.out
boruta.out <- Boruta(y = df_train_sm_2[[metric]], x=df_fs_2, doTrace=0)
boruta_results_2[[metric]] <- boruta.out
boruta.out <- Boruta(y = df_train_sm_3[[metric]], x=df_fs_3, doTrace=0)
boruta_results_3[[metric]] <- boruta.out
#boruta.out <- Boruta(y = df_train_sm_4[[metric]], x=df_fs_4, doTrace=0)
#boruta_results_4[[metric]] <- boruta.out
#boruta.out <- Boruta(y = df_train_sm_5[[metric]], x=df_fs_5, doTrace=0)
#boruta_results_5[[metric]] <- boruta.out
}## [1] "Applying Boruta to active_hours"
## [1] "Applying Boruta to active_hours_max"
## [1] "Applying Boruta to uri_count"
## [1] "Applying Boruta to uri_count_max"
## [1] "Applying Boruta to search_count"
## [1] "Applying Boruta to search_count_max"
## [1] "Applying Boruta to num_pages"
## [1] "Applying Boruta to num_pages_max"
## [1] "Applying Boruta to daily_max_tabs"
## [1] "Applying Boruta to daily_max_tabs_max"
## [1] "Applying Boruta to daily_unique_domains"
## [1] "Applying Boruta to daily_unique_domains_max"
## [1] "Applying Boruta to daily_tabs_opened"
## [1] "Applying Boruta to daily_tabs_opened_max"
for (metric in engagement){
plot(boruta_results_1[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
plot(boruta_results_2[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
plot(boruta_results_3[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
}Find the top 5 ranking features per metric, and add to a list.
features_top5_1 <- NULL
features_top5_2 <- NULL
features_top5_3 <- NULL
for(metric in engagement){
features_top5_1 <- c(names(sort(apply(boruta_results_1[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5_1)
features_top5_2 <- c(names(sort(apply(boruta_results_2[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5_2)
features_top5_3 <- c(names(sort(apply(boruta_results_3[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5_3)
}
top5_1 <- sort(c(unique(features_top5_1)))
top5_2 <- sort(c(unique(features_top5_2)))
top5_3 <- sort(c(unique(features_top5_3)))
n <- max(length(top5_1), length(top5_2), length(top5_3))
length(top5_1) <- n
length(top5_2) <- n
length(top5_3) <- n
x <- data.frame(top5_1, top5_2, top5_3)
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top5_1 | top5_2 | top5_3 |
|---|---|---|
| daily_num_sessions_started | cpu_cores | daily_num_sessions_started |
| daily_num_sessions_started_max | daily_num_sessions_started | daily_num_sessions_started_max |
| FX_PAGE_LOAD_MS_2_PARENT | daily_num_sessions_started_max | FX_PAGE_LOAD_MS_2_PARENT |
| num_active_days | FX_PAGE_LOAD_MS_2_PARENT | memory_mb |
| num_bookmarks | memory_cat | num_active_days |
| profile_age | memory_mb | num_bookmarks |
| profile_age_cat | num_active_days | profile_age |
| session_length | num_bookmarks | profile_age_cat |
| session_length_max | profile_age | session_length |
| TIME_TO_DOM_COMPLETE_MS | profile_age_cat | session_length_max |
| TIME_TO_DOM_CONTENT_LOADED_END_MS | session_length | startup_ms_max |
| TIME_TO_DOM_INTERACTIVE_MS | session_length_max | TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_LOAD_EVENT_END_MS | TIME_TO_DOM_CONTENT_LOADED_END_MS | TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_NON_BLANK_PAINT_MS | TIME_TO_DOM_INTERACTIVE_MS | TIME_TO_LOAD_EVENT_END_MS |
| timezone_cat_(4,6] | TIME_TO_LOAD_EVENT_END_MS | TIME_TO_NON_BLANK_PAINT_MS |
| NA | TIME_TO_NON_BLANK_PAINT_MS | NA |
| NA | timezone_cat_(6,8] | NA |
Increasing to 10:
features_top10_1 <- NULL
features_top10_2 <- NULL
features_top10_3 <- NULL
for(metric in engagement){
features_top10_1 <- c(names(sort(apply(boruta_results_1[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10_1)
features_top10_2 <- c(names(sort(apply(boruta_results_2[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10_2)
features_top10_3 <- c(names(sort(apply(boruta_results_3[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10_3)
}
top10_1 <- sort(c(unique(features_top10_1)))
top10_2 <- sort(c(unique(features_top10_2)))
top10_3 <- sort(c(unique(features_top10_3)))
n <- max(length(top10_1), length(top10_2), length(top10_3))
length(top10_1) <- n
length(top10_2) <- n
length(top10_3) <- n
x <- data.frame(top10_1, top10_2, top10_3)
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top10_1 | top10_2 | top10_3 |
|---|---|---|
| daily_num_sessions_started | cpu_cores | daily_num_sessions_started |
| daily_num_sessions_started_max | cpu_speed_mhz | daily_num_sessions_started_max |
| FX_PAGE_LOAD_MS_2_PARENT | daily_num_sessions_started | FX_PAGE_LOAD_MS_2_PARENT |
| memory_mb | daily_num_sessions_started_max | memory_cat |
| num_active_days | FX_PAGE_LOAD_MS_2_PARENT | memory_mb |
| num_bookmarks | memory_cat | num_active_days |
| profile_age | memory_mb | num_bookmarks |
| profile_age_cat | num_active_days | profile_age |
| session_length | num_addons | profile_age_cat |
| session_length_max | num_bookmarks | session_length |
| startup_ms | profile_age | session_length_max |
| startup_ms_max | profile_age_cat | startup_ms |
| TIME_TO_DOM_COMPLETE_MS | session_length | startup_ms_max |
| TIME_TO_DOM_CONTENT_LOADED_END_MS | session_length_max | TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_INTERACTIVE_MS | startup_ms | TIME_TO_DOM_CONTENT_LOADED_END_MS |
| TIME_TO_LOAD_EVENT_END_MS | startup_ms_max | TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_NON_BLANK_PAINT_MS | TIME_TO_DOM_COMPLETE_MS | TIME_TO_LOAD_EVENT_END_MS |
| timezone_cat_(4,6] | TIME_TO_DOM_CONTENT_LOADED_END_MS | TIME_TO_NON_BLANK_PAINT_MS |
| NA | TIME_TO_DOM_INTERACTIVE_MS | timezone_offset |
| NA | TIME_TO_LOAD_EVENT_END_MS | NA |
| NA | TIME_TO_NON_BLANK_PAINT_MS | NA |
| NA | timezone_cat_(6,8] | NA |
As we can see, using different samples hardly changes the result. Therefore, we can use only one df with 1000 samples. Equalize by label, then perform the above.
df_beta <- df_train_encoder %>%
filter(label_beta == 1)
n_beta <- nrow(df_beta)
set.seed(1234)
df_rel <- df_train_encoder %>%
filter(label_beta == 0) %>%
sample_n(n_beta)
set.seed(1234)
df_train_f_sm_eq <- df_rel %>%
bind_rows(df_beta) %>%
sample_n(1000)
df_fs_eq <- df_train_f_sm_eq %>%
select(-engagement) %>%
select(-content_crashes) %>%
select(-client_id) %>%
select(-label_beta) %>%
select(-label_release) %>%
select(-is_release) %>%
select(-app_version)boruta_results_eq <- list()
for (metric in engagement){
print(paste('Applying Boruta to ', metric))
boruta.out <- Boruta(y = df_train_f_sm_eq[[metric]], x=df_fs_eq, doTrace=0)
boruta_results_eq[[metric]] <- boruta.out
# plot(boruta.out, cex.axis=.7, las=2, xlab="", main=metric)
}## [1] "Applying Boruta to active_hours"
## [1] "Applying Boruta to active_hours_max"
## [1] "Applying Boruta to uri_count"
## [1] "Applying Boruta to uri_count_max"
## [1] "Applying Boruta to search_count"
## [1] "Applying Boruta to search_count_max"
## [1] "Applying Boruta to num_pages"
## [1] "Applying Boruta to num_pages_max"
## [1] "Applying Boruta to daily_max_tabs"
## [1] "Applying Boruta to daily_max_tabs_max"
## [1] "Applying Boruta to daily_unique_domains"
## [1] "Applying Boruta to daily_unique_domains_max"
## [1] "Applying Boruta to daily_tabs_opened"
## [1] "Applying Boruta to daily_tabs_opened_max"
for (metric in names(boruta_results_eq)){
plot(boruta_results_eq[[metric]], cex.axis=.7, las=2, xlab="", main=metric)
}features_top5 <- NULL
for(metric in names(boruta_results_eq)){
features_top5 <- c(names(sort(apply(boruta_results_eq[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:5]), features_top5)
}
x <- data.frame(top5 = sort(c(unique(features_top5))))
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top5 |
|---|
| daily_num_sessions_started |
| daily_num_sessions_started_max |
| FX_PAGE_LOAD_MS_2_PARENT |
| memory_mb |
| num_active_days |
| num_addons |
| num_bookmarks |
| profile_age |
| profile_age_cat |
| session_length |
| session_length_max |
| TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_LOAD_EVENT_END_MS |
| TIME_TO_NON_BLANK_PAINT_MS |
Increasing to 10:
features_top10 <- NULL
for(metric in names(boruta_results_eq)){
features_top10 <- c(names(sort(apply(boruta_results_eq[[metric]]$ImpHistory, 2, median), decreasing = TRUE)[1:10]), features_top10)
}
x <- data.frame(top10 = sort(c(unique(features_top10))))
kable(x) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%")| top10 |
|---|
| cpu_speed_mhz |
| daily_num_sessions_started |
| daily_num_sessions_started_max |
| FX_PAGE_LOAD_MS_2_PARENT |
| memory_mb |
| num_active_days |
| num_addons |
| num_bookmarks |
| profile_age |
| profile_age_cat |
| session_length |
| session_length_max |
| startup_ms |
| startup_ms_max |
| TIME_TO_DOM_COMPLETE_MS |
| TIME_TO_DOM_CONTENT_LOADED_END_MS |
| TIME_TO_DOM_INTERACTIVE_MS |
| TIME_TO_LOAD_EVENT_END_MS |
| TIME_TO_NON_BLANK_PAINT_MS |
| timezone_cat_(4,6] |
| timezone_offset |
save.image(file = "feature_selection.RData")